These datasets were provided by a company that wants a customer segmentation analysis and a churn model to help improve its performance. To start the project, it is important to create a copy of each dataset so the data can be manipulated without any risk of losing information. The datasets are then described, analysed and visualized as follows.
# Ingest df_1: customer fidelity subscriptions (semicolon-separated CSV).
client_fidelity <- read.csv(paste0(data_dir,"raw_1_cli_fid.csv"), sep=";")
# Work on a copy so the raw data stays untouched.
fidelity_clean <- client_fidelity
formattable(head(client_fidelity))
| ID_CLI | ID_FID | ID_NEG | TYP_CLI_FID | COD_FID | STATUS_FID | DT_ACTIVE |
|---|---|---|---|---|---|---|
| 500 | 814583 | 32 | 1 | PREMIUM | 1 | 2019-02-23 |
| 16647 | 781106 | 44 | 1 | PREMIUM | 1 | 2019-02-02 |
| 835335 | 816369 | 28 | 1 | PREMIUM | 1 | 2019-02-23 |
| 9557 | 746573 | 9 | 1 | PREMIUM | 1 | 2019-01-11 |
| 767877 | 741522 | 41 | 1 | PREMIUM | 1 | 2019-01-07 |
| 743090 | 776971 | 2 | 1 | PREMIUM | 1 | 2019-01-30 |
# Per-column summary statistics of the fidelity table.
formattable(summary(client_fidelity))
## ID_CLI ID_FID ID_NEG
## "Min. : 1 " "Min. : 3 " "Min. : 1.0 "
## "1st Qu.:230658 " "1st Qu.:229066 " "1st Qu.: 6.0 "
## "Median :462034 " "Median :458969 " "Median :23.0 "
## "Mean :462486 " "Mean :459425 " "Mean :22.1 "
## "3rd Qu.:693200 " "3rd Qu.:688435 " "3rd Qu.:36.0 "
## "Max. :934919 " "Max. :928121 " "Max. :49.0 "
## "NA " "NA " "NA "
## TYP_CLI_FID COD_FID STATUS_FID
## "Min. :0.0000 " "PREMIUM : 44029 " "Min. :0.00 "
## "1st Qu.:1.0000 " "PREMIUM BIZ : 6715 " "1st Qu.:1.00 "
## "Median :1.0000 " "STANDARD :290170 " "Median :1.00 "
## "Mean :0.9848 " "STANDARD BIZ: 29221 " "Mean :0.99 "
## "3rd Qu.:1.0000 " "NA " "3rd Qu.:1.00 "
## "Max. :1.0000 " "NA " "Max. :1.00 "
## "NA " "NA " "NA "
## DT_ACTIVE
## "2018-11-23: 3024 "
## "2018-04-07: 1457 "
## "2018-11-22: 1439 "
## "2018-03-11: 1438 "
## "2018-04-14: 1403 "
## "2018-04-28: 1403 "
## "(Other) :359971 "
We check for possible duplicate rows in the dataset:
# Duplicate check: compare the row count with the distinct counts of the keys.
# n_distinct() accepts multiple columns and counts distinct combinations,
# which avoids building a throwaway "ID-ID" string with paste0().
fidelity_clean_duplicate <- fidelity_clean %>%
  dplyr::summarize(TOT_ID_CLIs = n_distinct(ID_CLI),
                   TOT_ID_FIDs = n_distinct(ID_FID),
                   TOT_ID_CLIFIDs = n_distinct(ID_CLI, ID_FID),
                   TOT_ROWs = n())
formattable(fidelity_clean_duplicate)
| TOT_ID_CLIs | TOT_ID_FIDs | TOT_ID_CLIFIDs | TOT_ROWs |
|---|---|---|---|
| 369472 | 367925 | 370135 | 370135 |
There are no duplicates. Next we format the dates and encode the boolean columns as factors.
# Format types: DT_ACTIVE as Date; TYP_CLI_FID and STATUS_FID as factors.
fidelity_clean <- fidelity_clean %>%
mutate(DT_ACTIVE = as.Date(DT_ACTIVE)) %>%
mutate(TYP_CLI_FID = as.factor(TYP_CLI_FID)) %>%
mutate(STATUS_FID = as.factor(STATUS_FID))
We start a Consistency check.
## First step: count the distinct subscriptions and activation dates per client.
number_fidelity_client <- fidelity_clean %>%
group_by(ID_CLI) %>%
dplyr::summarize(NUM_FIDs = n_distinct(ID_FID)
, NUM_DATEs = n_distinct(DT_ACTIVE)
)
# Total number of distinct clients (denominator for the percentages below).
tot_id_cli <- n_distinct(number_fidelity_client$ID_CLI)
## Second step: distribution of clients by number of subscriptions and dates.
dist_number_fidelity_client <- number_fidelity_client %>%
group_by(NUM_FIDs, NUM_DATEs) %>%
dplyr::summarize(TOT_CLIs = n_distinct(ID_CLI)) %>%
mutate(PERCENT_CLIs = TOT_CLIs/tot_id_cli)
formattable(dist_number_fidelity_client)
| NUM_FIDs | NUM_DATEs | TOT_CLIs | PERCENT_CLIs |
|---|---|---|---|
| 1 | 1 | 368833 | 0.99827050494 |
| 2 | 1 | 254 | 0.00068746752 |
| 2 | 2 | 363 | 0.00098248311 |
| 3 | 1 | 7 | 0.00001894596 |
| 3 | 2 | 8 | 0.00002165252 |
| 3 | 3 | 5 | 0.00001353283 |
| 4 | 1 | 2 | 0.00000541313 |
# Inspect a few clients holding exactly 3 subscriptions.
number_fidelity_client %>% filter(NUM_FIDs == 3) %>% head() %>% formattable()
| ID_CLI | NUM_FIDs | NUM_DATEs |
|---|---|---|
| 7533 | 3 | 3 |
| 11477 | 3 | 1 |
| 68556 | 3 | 1 |
| 96537 | 3 | 1 |
| 223203 | 3 | 3 |
| 250133 | 3 | 2 |
# Example client holding multiple subscriptions.
fidelity_clean %>% filter(ID_CLI == 621814) %>% formattable()
| ID_CLI | ID_FID | ID_NEG | TYP_CLI_FID | COD_FID | STATUS_FID | DT_ACTIVE |
|---|---|---|---|---|---|---|
| 621814 | 578123 | 1 | 1 | STANDARD | 0 | 2018-10-13 |
| 621814 | 646483 | 18 | 1 | STANDARD | 0 | 2018-11-13 |
| 621814 | 661124 | 18 | 1 | STANDARD | 1 | 2018-11-20 |
# Example client with subscriptions sharing an activation date.
fidelity_clean %>% filter(ID_CLI == 320880) %>% formattable()
| ID_CLI | ID_FID | ID_NEG | TYP_CLI_FID | COD_FID | STATUS_FID | DT_ACTIVE |
|---|---|---|---|---|---|---|
| 320880 | 248462 | 8 | 1 | STANDARD | 0 | 2018-04-25 |
| 320880 | 250899 | 8 | 1 | PREMIUM | 0 | 2018-04-26 |
| 320880 | 250910 | 8 | 1 | STANDARD | 1 | 2018-04-26 |
This table shows that some clients actually hold multiple subscriptions. These subscriptions may have different activation dates, or share the same date, probably for technical reasons.
It is important to reshape the data frame so that each client row combines the information from the last subscription (fidelity type, status), the first subscription (registration date and registration store), and the total count of subscriptions. We left-join df_1_cli_fid_first and number_fidelity_client onto df_1_cli_fid_last:
# First subscription per client: earliest DT_ACTIVE, ties broken by lowest ID_FID.
df_1_cli_fid_first <- fidelity_clean %>%
group_by(ID_CLI) %>%
filter(DT_ACTIVE == min(DT_ACTIVE)) %>%
arrange(ID_FID) %>%
filter(row_number() == 1) %>%
ungroup() %>%
as.data.frame()
# Last subscription per client: latest DT_ACTIVE, ties broken by highest ID_FID.
df_1_cli_fid_last <- fidelity_clean %>%
group_by(ID_CLI) %>%
filter(DT_ACTIVE == max(DT_ACTIVE)) %>%
arrange(desc(ID_FID)) %>%
filter(row_number() == 1) %>%
ungroup() %>%
as.data.frame()
# Reshape to one row per client: last subscription's programme/status/date,
# first subscription's store and date, plus the number of subscriptions held.
fidelity_clean <- df_1_cli_fid_last %>%
select(ID_CLI
, ID_FID
, LAST_COD_FID = COD_FID
, LAST_TYP_CLI_FID = TYP_CLI_FID
, LAST_STATUS_FID = STATUS_FID
, LAST_DT_ACTIVE = DT_ACTIVE) %>%
left_join(df_1_cli_fid_first %>%
select(ID_CLI
, FIRST_ID_NEG = ID_NEG
, FIRST_DT_ACTIVE = DT_ACTIVE)
, by = 'ID_CLI') %>%
left_join(number_fidelity_client %>%
select(ID_CLI
, NUM_FIDs) %>%
mutate(NUM_FIDs = as.factor(NUM_FIDs))
, by = 'ID_CLI')
## Distribution of clients by last fidelity programme (LAST_COD_FID).
df1_dist_codfid <- fidelity_clean %>%
group_by(LAST_COD_FID) %>%
dplyr::summarize(TOT_CLIs = n_distinct(ID_CLI)) %>%
mutate(PERCENT = TOT_CLIs/sum(TOT_CLIs)) %>%
arrange(desc(PERCENT))
df1_dist_codfid %>% formattable()
| LAST_COD_FID | TOT_CLIs | PERCENT |
|---|---|---|
| STANDARD | 289756 | 0.78424346 |
| PREMIUM | 43878 | 0.11875866 |
| STANDARD BIZ | 29148 | 0.07889096 |
| PREMIUM BIZ | 6690 | 0.01810692 |
# Bar chart: number of clients per fidelity programme, labelled with counts.
ggplot(df1_dist_codfid, aes(x = LAST_COD_FID, y = TOT_CLIs)) +
  geom_col(fill = "steelblue") +
  geom_text(aes(label = TOT_CLIs), color = "white", vjust = 1.6, size = 3.5) +
  theme_minimal()
# Distribution of clients by number of subscriptions held (NUM_FIDs).
df1_dist_codfid <- fidelity_clean %>%
group_by(NUM_FIDs) %>%
dplyr::summarize(TOT_CLIs = n_distinct(ID_CLI)) %>%
mutate(PERCENT = TOT_CLIs/sum(TOT_CLIs)) %>%
arrange(desc(PERCENT))
df1_dist_codfid %>% formattable()
| NUM_FIDs | TOT_CLIs | PERCENT |
|---|---|---|
| 1 | 368833 | 0.99827050494 |
| 2 | 617 | 0.00166995063 |
| 3 | 20 | 0.00005413130 |
| 4 | 2 | 0.00000541313 |
# Bar chart: clients by number of subscriptions held.
ggplot(df1_dist_codfid, aes(x = NUM_FIDs, y = TOT_CLIs)) +
  geom_col(fill = "steelblue") +
  geom_text(aes(label = TOT_CLIs), color = "white", vjust = 1.6, size = 3.5) +
  theme_minimal()
# Distribution of clients by year of the last activation date.
# Naming the grouping expression directly (Year = ...) avoids the fragile
# rename() on the backtick-quoted expression label, and format(.., "%Y")
# states the intent (extract the year of a Date) more clearly than taking
# characters 1-4 of the implicit character coercion.
df1_dist_codfid <- fidelity_clean %>%
  group_by(Year = format(LAST_DT_ACTIVE, "%Y")) %>%
  dplyr::summarize(TOT_CLIs = n_distinct(ID_CLI)) %>%
  mutate(PERCENT = TOT_CLIs/sum(TOT_CLIs)) %>%
  arrange(desc(PERCENT))
df1_dist_codfid %>% formattable()
| Year | TOT_CLIs | PERCENT |
|---|---|---|
| 2018 | 294855 | 0.7980442 |
| 2019 | 74617 | 0.2019558 |
# Bar chart: clients by year of last activation.
ggplot(df1_dist_codfid, aes(x = Year, y = TOT_CLIs)) +
  geom_col(fill = "steelblue") +
  geom_text(aes(label = TOT_CLIs), color = "white", vjust = 1.6, size = 3.5) +
  theme_minimal()
# Distribution of clients by last subscription status flag.
df1_dist_codfid <- fidelity_clean %>%
group_by(LAST_STATUS_FID) %>%
dplyr::summarize(TOT_CLIs = n_distinct(ID_CLI)) %>%
mutate(PERCENT = TOT_CLIs/sum(TOT_CLIs)) %>%
arrange(desc(PERCENT))
df1_dist_codfid %>% formattable()
| LAST_STATUS_FID | TOT_CLIs | PERCENT |
|---|---|---|
| 1 | 366413 | 0.991720618 |
| 0 | 3059 | 0.008279382 |
# Bar chart: clients by last subscription status flag.
ggplot(df1_dist_codfid, aes(x = LAST_STATUS_FID, y = TOT_CLIs)) +
  geom_col(fill = "steelblue") +
  geom_text(aes(label = TOT_CLIs), color = "white", vjust = 1.6, size = 3.5) +
  theme_minimal()
#### INGESTION df_2 customers accounts details ####
# Read the raw accounts file (semicolon-separated) and work on a copy.
client_account <- read.csv(paste0(data_dir,"raw_2_cli_account.csv"), sep=";")
account_clean <- client_account
First check for duplicates.
# Duplicate check: one row per ID_CLI is expected.
account_clean %>%
dplyr::summarize(TOT_ID_CLIs = n_distinct(ID_CLI),
TOT_ROWs = n()) %>%
formattable()
| TOT_ID_CLIs | TOT_ROWs |
|---|---|
| 369472 | 369472 |
There are no duplicates. Then we start to format the columns and check for NA.
# Encode the phone flag and the account type as factors, then inspect NAs.
account_clean <- account_clean %>%
mutate(W_PHONE = as.factor(W_PHONE)) %>%
mutate(TYP_CLI_ACCOUNT = as.factor(TYP_CLI_ACCOUNT))
summary(account_clean)
## ID_CLI EMAIL_PROVIDER W_PHONE ID_ADDRESS
## Min. : 1 gmail.com :151508 1 :342167 Min. : 1
## 1st Qu.:230783 libero.it : 57782 NA's: 27305 1st Qu.:227903
## Median :462062 hotmail.it : 28698 Median :456720
## Mean :462541 alice.it : 18127 Mean :457283
## 3rd Qu.:693197 yahoo.it : 16538 3rd Qu.:686533
## Max. :934919 hotmail.com: 10076 Max. :900091
## (Other) : 86743
## TYP_CLI_ACCOUNT TYP_JOB
## 2: 35816 :360810
## 4:333656 Libero professionista: 3970
## Impiegato/a : 1560
## Altro : 784
## Pensionato/a : 641
## Operaio/a : 482
## (Other) : 1225
Actually, there are several NA values we need to handle.
# Make NAs explicit factor levels: missing W_PHONE becomes "0", missing
# EMAIL_PROVIDER/TYP_JOB become "(missing)".
# NOTE(review): fct_explicit_na() is deprecated in forcats >= 1.0 in favour
# of fct_na_value_to_level(); kept here for compatibility with this code.
account_clean <- account_clean %>%
mutate(W_PHONE = fct_explicit_na(W_PHONE, "0")) %>%
mutate(EMAIL_PROVIDER = fct_explicit_na(EMAIL_PROVIDER, "(missing)")) %>%
mutate(TYP_JOB = fct_explicit_na(TYP_JOB, "(missing)"))
We run a consistency check between df_1 and df_2.
# Consistency check: full join the distinct ID_CLIs of df_1 and df_2 and
# count how many clients fall in each membership combination.
consistency_df1_df2 <- fidelity_clean %>%
select(ID_CLI) %>%
mutate(is_in_df_1 = 1) %>%
distinct() %>%
full_join(account_clean %>%
select(ID_CLI) %>%
mutate(is_in_df_2 = 1) %>%
distinct(),
by = "ID_CLI"
) %>%
group_by(is_in_df_1, is_in_df_2) %>%
dplyr::summarize(NUM_ID_CLIs = n_distinct(ID_CLI)) %>%
as.data.frame()
## `summarise()` regrouping output by 'is_in_df_1' (override with `.groups` argument)
consistency_df1_df2 %>% formattable()
| is_in_df_1 | is_in_df_2 | NUM_ID_CLIs |
|---|---|---|
| 1 | 1 | 369472 |
We can conclude there is perfect consistency: every ID_CLI in df_1 is also in df_2 and vice versa. We reshape the data frame to obtain new information: we keep the most frequent EMAIL_PROVIDER values and add a common factor level "others" for the remaining ones.
# Distribution of clients per e-mail provider, plus cumulative coverage.
df_2_dist_emailprovider <- account_clean %>%
group_by(EMAIL_PROVIDER) %>%
dplyr::summarize(TOT_CLIs = n_distinct(ID_CLI)) %>%
mutate(PERCENT = TOT_CLIs/sum(TOT_CLIs)) %>%
arrange(desc(PERCENT)) %>%
as.data.frame()
df_2_dist_emailprovider %>%
arrange(desc(PERCENT)) %>%
mutate(PERCENT_COVERED = cumsum(TOT_CLIs)/sum(TOT_CLIs)) %>%
as.data.frame() %>%
head(20) %>%
formattable()
| EMAIL_PROVIDER | TOT_CLIs | PERCENT | PERCENT_COVERED |
|---|---|---|---|
| gmail.com | 151508 | 0.410066257 | 0.4100663 |
| libero.it | 57782 | 0.156390741 | 0.5664570 |
| hotmail.it | 28698 | 0.077673004 | 0.6441300 |
| alice.it | 18127 | 0.049061905 | 0.6931919 |
| yahoo.it | 16538 | 0.044761173 | 0.7379531 |
| hotmail.com | 10076 | 0.027271349 | 0.7652244 |
| virgilio.it | 9161 | 0.024794842 | 0.7900193 |
| tiscali.it | 8733 | 0.023636433 | 0.8136557 |
| live.it | 7936 | 0.021479300 | 0.8351350 |
| 5889 | 0.015938962 | 0.8510740 | |
| icloud.com | 3735 | 0.010109020 | 0.8611830 |
| yahoo.com | 3259 | 0.008820695 | 0.8700037 |
| gmail.it | 2266 | 0.006133076 | 0.8761368 |
| tin.it | 2183 | 0.005908431 | 0.8820452 |
| outlook.it | 2039 | 0.005518686 | 0.8875639 |
| fastwebnet.it | 1749 | 0.004733782 | 0.8922977 |
| inwind.it | 1514 | 0.004097739 | 0.8963954 |
| email.it | 1103 | 0.002985341 | 0.8993807 |
| me.com | 1034 | 0.002798588 | 0.9021793 |
| live.com | 837 | 0.002265395 | 0.9044447 |
We keep the missing level for technical reasons and select levels that cover 85% of the cases.
# Keep every provider until cumulative coverage reaches 85%, including the
# provider that crosses the threshold; everything after it collapses to
# "others" (the "(missing)" level is always preserved).
clean_email_providers <- df_2_dist_emailprovider %>%
  arrange(desc(PERCENT)) %>%
  mutate(PERCENT_COVERED = cumsum(TOT_CLIs)/sum(TOT_CLIs)) %>%
  mutate(EMAIL_PROVIDER = as.character(EMAIL_PROVIDER)) %>%
  # A row is kept iff the coverage BEFORE it is still under 85%.
  # lag(..., default = 0) removes the NA the original lag() produced on the
  # first row, and the single condition also covers the exact
  # PERCENT_COVERED == 0.85 edge case the original two-clause test missed.
  mutate(AUX = if_else(lag(PERCENT_COVERED, default = 0) < 0.85, 1, 0)) %>%
  # Compare AUX explicitly instead of relying on numeric-to-logical coercion.
  mutate(EMAIL_PROVIDER_CLEAN = if_else(AUX == 1 | EMAIL_PROVIDER == "(missing)", EMAIL_PROVIDER, "others"))
formattable(head(clean_email_providers, 5))
| EMAIL_PROVIDER | TOT_CLIs | PERCENT | PERCENT_COVERED | AUX | EMAIL_PROVIDER_CLEAN |
|---|---|---|---|---|---|
| gmail.com | 151508 | 0.41006626 | 0.4100663 | 1 | gmail.com |
| libero.it | 57782 | 0.15639074 | 0.5664570 | 1 | libero.it |
| hotmail.it | 28698 | 0.07767300 | 0.6441300 | 1 | hotmail.it |
| alice.it | 18127 | 0.04906190 | 0.6931919 | 1 | alice.it |
| yahoo.it | 16538 | 0.04476117 | 0.7379531 | 1 | yahoo.it |
Then we join the cleaned EMAIL_PROVIDER levels back onto the account data frame.
# Attach the cleaned provider level to each account via a lookup join.
account_clean <- account_clean %>%
mutate(EMAIL_PROVIDER = as.character(EMAIL_PROVIDER)) %>%
left_join(clean_email_providers %>%
select(EMAIL_PROVIDER, EMAIL_PROVIDER_CLEAN)
, by = "EMAIL_PROVIDER") %>%
mutate(EMAIL_PROVIDER_CLEAN = as.factor(EMAIL_PROVIDER_CLEAN))
## Distribution of clients per raw EMAIL_PROVIDER value.
plot_df2 <- account_clean %>%
group_by(EMAIL_PROVIDER) %>%
dplyr::summarize(TOT_CLIs = n_distinct(ID_CLI)) %>%
mutate(PERCENT = TOT_CLIs/sum(TOT_CLIs)) %>%
arrange(desc(PERCENT)) %>%
as.data.frame()
## `summarise()` ungrouping output (override with `.groups` argument)
plot_df2 %>% head() %>% formattable()
| EMAIL_PROVIDER | TOT_CLIs | PERCENT |
|---|---|---|
| gmail.com | 151508 | 0.41006626 |
| libero.it | 57782 | 0.15639074 |
| hotmail.it | 28698 | 0.07767300 |
| alice.it | 18127 | 0.04906190 |
| yahoo.it | 16538 | 0.04476117 |
| hotmail.com | 10076 | 0.02727135 |
# Plot only the six most common providers, labelled with client counts.
plot_df21 <- head(plot_df2)
ggplot(plot_df21, aes(x = EMAIL_PROVIDER, y = TOT_CLIs)) +
  geom_col(fill = "steelblue") +
  geom_text(aes(label = TOT_CLIs), color = "white", vjust = 1.6, size = 3.5) +
  theme_minimal()
There are too many different values of EMAIL_PROVIDER for it to be a useful category as-is, but focusing on the top 6 providers we can see that Gmail is the most used.
# TYPE JOB: distribution of clients per declared job.
plot_df2 <- account_clean %>%
group_by(TYP_JOB) %>%
dplyr::summarize(TOT_CLIs = n_distinct(ID_CLI)) %>%
mutate(PERCENT = TOT_CLIs/sum(TOT_CLIs)) %>%
arrange(desc(PERCENT)) %>%
as.data.frame()
plot_df2 %>% head() %>% formattable()
| TYP_JOB | TOT_CLIs | PERCENT |
|---|---|---|
| 360810 | 0.976555734 | |
| Libero professionista | 3970 | 0.010745063 |
| Impiegato/a | 1560 | 0.004222241 |
| Altro | 784 | 0.002121947 |
| Pensionato/a | 641 | 0.001734908 |
| Operaio/a | 482 | 0.001304564 |
# Bar chart of clients per job type.
ggplot(plot_df2, aes(x = TYP_JOB, y = TOT_CLIs)) +
  geom_col(fill = "steelblue") +
  geom_text(aes(label = TOT_CLIs), color = "white", vjust = 1.6, size = 3.5) +
  theme_minimal()
# 15 different job types, but only ~3% of clients declare a job at all:
# the variable carries too little information to be useful.
# W_PHONE: share of clients that provided a phone number.
plot_df2 <- account_clean %>%
group_by(W_PHONE) %>%
dplyr::summarize(TOT_CLIs = n_distinct(ID_CLI)) %>%
mutate(PERCENT = TOT_CLIs/sum(TOT_CLIs)) %>%
arrange(desc(PERCENT)) %>%
as.data.frame()
plot_df2 %>% head() %>% formattable()
| W_PHONE | TOT_CLIs | PERCENT |
|---|---|---|
| 1 | 342167 | 0.92609724 |
| 0 | 27305 | 0.07390276 |
# Bar chart of clients with/without a phone number on record.
ggplot(plot_df2, aes(x = W_PHONE, y = TOT_CLIs)) +
  geom_col(fill = "steelblue") +
  geom_text(aes(label = TOT_CLIs), color = "white", vjust = 1.6, size = 3.5) +
  theme_minimal()
# Interesting variable for marketing purposes: only ~7% of clients have not
# released a telephone number.
Now we use the aggregated data from the reshaping, i.e. with the cleaned variables.
## Distribution per cleaned e-mail provider (top levels + "others").
plot_df2 <- account_clean %>%
group_by(EMAIL_PROVIDER_CLEAN) %>%
dplyr::summarize(TOT_CLIs = n_distinct(ID_CLI)) %>%
mutate(PERCENT = TOT_CLIs/sum(TOT_CLIs)) %>%
arrange(desc(PERCENT))
## `summarise()` ungrouping output (override with `.groups` argument)
plot_df2 %>% head() %>% formattable()
| EMAIL_PROVIDER_CLEAN | TOT_CLIs | PERCENT |
|---|---|---|
| gmail.com | 151508 | 0.41006626 |
| libero.it | 57782 | 0.15639074 |
| others | 55024 | 0.14892603 |
| hotmail.it | 28698 | 0.07767300 |
| alice.it | 18127 | 0.04906190 |
| yahoo.it | 16538 | 0.04476117 |
# Bar chart of clients per cleaned e-mail provider level.
ggplot(plot_df2, aes(x = EMAIL_PROVIDER_CLEAN, y = TOT_CLIs)) +
  geom_col(fill = "steelblue") +
  geom_text(aes(label = TOT_CLIs), color = "white", vjust = 1.6, size = 3.5) +
  theme_minimal()
As confirmed before Gmail is the most used provider.
# TYPE_CLI_ACCOUNT
## Distribution of clients per account type.
plot_df2 <- account_clean %>%
group_by(TYP_CLI_ACCOUNT) %>%
dplyr::summarize(TOT_CLIs = n_distinct(ID_CLI)) %>%
mutate(PERCENT = TOT_CLIs/sum(TOT_CLIs)) %>%
arrange(desc(PERCENT))
plot_df2 %>% head() %>% formattable()
| TYP_CLI_ACCOUNT | TOT_CLIs | PERCENT |
|---|---|---|
| 4 | 333656 | 0.90306167 |
| 2 | 35816 | 0.09693833 |
# Bar chart of clients per account type.
ggplot(plot_df2, aes(x = TYP_CLI_ACCOUNT, y = TOT_CLIs)) +
  geom_col(fill = "steelblue") +
  geom_text(aes(label = TOT_CLIs), color = "white", vjust = 1.6, size = 3.5) +
  theme_minimal()
Type 4 accounts are far more common than type 2.
#### INGESTION df_3 customers addresses ####
# Read the raw address file and work on a copy.
client_address <- read.csv(paste0(data_dir,"raw_3_cli_address.csv"), sep=";")
df_3_cli_address_clean <- client_address
Check duplicates and clean it.
# Duplicate check: the same ID_ADDRESS appears on many rows here.
df_3_cli_address_clean %>%
dplyr::summarize(TOT_ID_ADDRESSes = n_distinct(ID_ADDRESS),
TOT_ROWs = n()) %>%
formattable()
| TOT_ID_ADDRESSes | TOT_ROWs |
|---|---|
| 361330 | 1211332 |
# Drop fully duplicated rows.
df_3_cli_address_clean <- df_3_cli_address_clean %>% distinct()
Format String.
# Keep CAP (postal code) as character — presumably to preserve leading
# zeros; verify against the raw data.
df_3_cli_address_clean <- df_3_cli_address_clean %>%
mutate(CAP = as.character(CAP))
Check for Missing Values
# Inspect column types.
str(df_3_cli_address_clean)
## 'data.frame': 361332 obs. of 4 variables:
## $ ID_ADDRESS: int 1337 1344 1347 1352 1353 1355 1361 1379 1384 1387 ...
## $ CAP : chr "20083" "20024" "20090" "20123" ...
## $ PRV : Factor w/ 241 levels "","-",".","06061",..: 114 114 114 114 114 114 123 114 114 114 ...
## $ REGION : Factor w/ 21 levels "","ABRUZZO","BASILICATA",..: 10 10 10 10 10 10 13 10 10 10 ...
# Summary, including NA counts per column.
summary(df_3_cli_address_clean)
## ID_ADDRESS CAP PRV REGION
## Min. : 1 Length:361332 MI : 38850 LOMBARDIA: 97181
## 1st Qu.:224025 Class :character RM : 29529 LAZIO : 32058
## Median :448623 Mode :character : 23269 CAMPANIA : 30570
## Mean :449067 TO : 18322 VENETO : 29696
## 3rd Qu.:673822 PA : 17448 SICILIA : 28329
## Max. :900090 (Other):211540 PIEMONTE : 24377
## NA's : 22374 (Other) :119121
# Missing-value pattern: count addresses by presence of CAP/PRV/REGION.
df_3_cli_address_clean %>%
group_by(w_CAP = !is.na(CAP)
, w_PRV = !is.na(PRV)
, w_REGION = !is.na(REGION)) %>%
dplyr::summarize(TOT_ADDs = n_distinct(ID_ADDRESS)) %>%
formattable()
| w_CAP | w_PRV | w_REGION | TOT_ADDs |
|---|---|---|---|
| TRUE | FALSE | TRUE | 22374 |
| TRUE | TRUE | TRUE | 338956 |
Clean Missing values
# Keep only addresses with complete CAP, PRV and REGION.
df_3_cli_address_clean <- df_3_cli_address_clean %>%
filter(!is.na(CAP) & !is.na(PRV) & !is.na(REGION))
Check consistency between dataset 2 and dataset 3.
# Consistency check on ID_ADDRESS between df_2 (accounts) and df_3 (addresses).
cons_idaddress_df2_df3 <- account_clean %>%
select(ID_ADDRESS) %>%
mutate(is_in_df_2 = 1) %>%
distinct() %>%
full_join(df_3_cli_address_clean %>%
select(ID_ADDRESS) %>%
mutate(is_in_df_3 = 1) %>%
distinct()
, by = "ID_ADDRESS"
) %>%
group_by(is_in_df_2, is_in_df_3) %>%
dplyr::summarize(NUM_ID_ADDRESSes = n_distinct(ID_ADDRESS)) %>%
as.data.frame()
cons_idaddress_df2_df3 %>% formattable()
| is_in_df_2 | is_in_df_3 | NUM_ID_ADDRESSes |
|---|---|---|
| 1 | 1 | 338956 |
| 1 | NA | 22429 |
## REGION
## Distribution of addresses per region.
df2_dist_3 <- df_3_cli_address_clean %>%
  group_by(REGION) %>%
  dplyr::summarize(TOT_CLIs = n_distinct(ID_ADDRESS)) %>%
  mutate(PERCENT = TOT_CLIs/sum(TOT_CLIs)) %>%
  arrange(desc(PERCENT))
# Fix: truncate with head() BEFORE formatting, consistent with the rest of
# the file (the original piped formattable() %>% head(), which formats the
# whole table first and only then drops rows).
df2_dist_3 %>% head() %>% formattable()
| REGION | TOT_CLIs | PERCENT |
|---|---|---|
| LOMBARDIA | 97181 | 0.28670514 |
| LAZIO | 32058 | 0.09457809 |
| VENETO | 29696 | 0.08760967 |
| SICILIA | 28329 | 0.08357673 |
| PIEMONTE | 24377 | 0.07191746 |
| 23864 | 0.07040400 |
# Plot the six regions with the most addresses.
head3 <- head(df2_dist_3, 6)
ggplot(head3, aes(x = REGION, y = TOT_CLIs)) +
  geom_col(fill = "steelblue") +
  geom_text(aes(label = TOT_CLIs), color = "white", vjust = 1.6, size = 3.5) +
  theme_minimal()
## PRV
## Distribution of addresses per province.
df2_dist_3 <- df_3_cli_address_clean %>%
  group_by(PRV) %>%
  dplyr::summarize(TOT_CLIs = n_distinct(ID_ADDRESS)) %>%
  mutate(PERCENT = TOT_CLIs/sum(TOT_CLIs)) %>%
  arrange(desc(PERCENT))
# Fix: truncate with head() BEFORE formatting, consistent with the rest of
# the file (the original piped formattable() %>% head()).
df2_dist_3 %>% head() %>% formattable()
| PRV | TOT_CLIs | PERCENT |
|---|---|---|
| MI | 38850 | 0.11461597 |
| RM | 29529 | 0.08711699 |
| 23269 | 0.06864862 | |
| TO | 18322 | 0.05405389 |
| PA | 17448 | 0.05147540 |
| MB | 14751 | 0.04351867 |
# Plot the six provinces with the most addresses.
head3 <- head(df2_dist_3, 6)
ggplot(head3, aes(x = PRV, y = TOT_CLIs)) +
  geom_col(fill = "steelblue") +
  geom_text(aes(label = TOT_CLIs), color = "white", vjust = 1.6, size = 3.5) +
  theme_minimal()
#### INGESTION df_4 customers privacy data ####
# Read the raw privacy file and work on a copy.
client_privacy <- read.csv(paste0(data_dir,"raw_4_cli_privacy.csv"), sep=";")
df_4_cli_privacy_clean <- client_privacy
Check Duplicates
# Duplicate check: one row per ID_CLI is expected.
df_4_cli_privacy_clean %>%
dplyr::summarize(TOT_ID_CLIs = n_distinct(ID_CLI),
TOT_ROWs = n()) %>%
formattable()
| TOT_ID_CLIs | TOT_ROWs |
|---|---|
| 369472 | 369472 |
No Duplicates.
Format columns.
# Encode the three privacy flags as factors.
df_4_cli_privacy_clean <- df_4_cli_privacy_clean %>%
mutate(FLAG_PRIVACY_1 = as.factor(FLAG_PRIVACY_1)) %>%
mutate(FLAG_PRIVACY_2 = as.factor(FLAG_PRIVACY_2)) %>%
mutate(FLAG_DIRECT_MKT = as.factor(FLAG_DIRECT_MKT))
Consistency check between dataset1 and dataset4
# Consistency check on ID_CLI between df_1 and df_4.
cons_idcli_df1_df4 <- fidelity_clean %>%
select(ID_CLI) %>%
mutate(is_in_df_1 = 1) %>%
distinct() %>%
full_join(df_4_cli_privacy_clean %>%
select(ID_CLI) %>%
mutate(is_in_df_4 = 1) %>%
distinct()
, by = "ID_CLI"
) %>%
group_by(is_in_df_1, is_in_df_4) %>%
dplyr::summarize(NUM_ID_CLIs = n_distinct(ID_CLI)) %>%
as.data.frame()
cons_idcli_df1_df4
## is_in_df_1 is_in_df_4 NUM_ID_CLIs
## 1 1 1 369472
# Count how many clients granted each privacy flag.
# BUG FIX: the flags were converted to factors above, so a plain
# as.integer() returns the factor LEVEL CODES (1/2) rather than the
# underlying 0/1 values, inflating every sum by the number of rows.
# Convert through as.character() first to recover the real 0/1 values.
# (The ID_CLI sum produced by summarise_all() is meaningless and is
# dropped before plotting below.)
df4_dist_codfid <- df_4_cli_privacy_clean %>%
  mutate(FLAG_PRIVACY_1 = as.integer(as.character(FLAG_PRIVACY_1))) %>%
  mutate(FLAG_PRIVACY_2 = as.integer(as.character(FLAG_PRIVACY_2))) %>%
  mutate(FLAG_DIRECT_MKT = as.integer(as.character(FLAG_DIRECT_MKT))) %>%
  summarise_all(sum)
df4_dist_codfid <- t(df4_dist_codfid)
df4_dist_codfid <- as.data.frame(df4_dist_codfid)
a <- c("NULL", "FLAG_PRIVACY_1", "FLAG_PRIVACY_2", "FLAG_DIRECT_MKT")
df4_dist_codfid %>% formattable()
| V1 | |
|---|---|
| ID_CLI | 170895923424 |
| FLAG_PRIVACY_1 | 611723 |
| FLAG_PRIVACY_2 | 715154 |
| FLAG_DIRECT_MKT | 617262 |
# Attach the labels, drop the meaningless ID_CLI row, and plot the counts.
# NOTE(review): the labels in `a` are matched to the t() rows purely by
# position — confirm the order if the flag columns ever change.
df4_dist_codfid <- cbind(df4_dist_codfid ,a)
df4_dist_codfid <- df4_dist_codfid[-c(1),]
ggplot(data=df4_dist_codfid, aes(x=a, y=V1)) +
geom_bar(stat="identity", fill="steelblue") +
geom_text(aes(label=V1), vjust=1.6, color="white", size=3.5)+
theme_minimal()
#### INGESTION df_5 email campaign descriptions ####
# Read the raw campaign catalogue and work on a copy.
campaign_cat <- read.csv(paste0(data_dir,"raw_5_camp_cat.csv"), sep=";")
campaign_category_clean <- campaign_cat
We check NA
# Inspect column types.
str(campaign_category_clean)
## 'data.frame': 848 obs. of 3 variables:
## $ ID_CAMP : int 757 759 760 761 762 763 764 765 767 769 ...
## $ TYP_CAMP : Factor w/ 5 levels "LOCAL","NATIONAL",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ CHANNEL_CAMP: Factor w/ 1 level "EMAIL": 1 1 1 1 1 1 1 1 1 1 ...
# Summary: campaign types plus the single-level channel column.
summary(campaign_category_clean)
## ID_CAMP TYP_CAMP CHANNEL_CAMP
## Min. : 5.0 LOCAL : 48 EMAIL:848
## 1st Qu.: 327.8 NATIONAL :150
## Median : 561.5 NEWSLETTER :109
## Mean : 559.6 PERSONALIZED:169
## 3rd Qu.: 812.2 PRODUCT :372
## Max. :1052.0
We find out that CHANNEL_CAMP column is not important so we remove it.
# CHANNEL_CAMP has a single level (EMAIL), so it carries no information.
campaign_category_clean <- campaign_category_clean %>%
select(-CHANNEL_CAMP)
# Distribution of campaigns per type.
plot5 <- campaign_category_clean %>%
group_by(TYP_CAMP) %>%
dplyr::summarize(TOT_CLIs = n_distinct(ID_CAMP)) %>%
mutate(PERCENT = TOT_CLIs/sum(TOT_CLIs)) %>%
arrange(desc(PERCENT))
plot5 %>% head() %>% formattable()
| TYP_CAMP | TOT_CLIs | PERCENT |
|---|---|---|
| PRODUCT | 372 | 0.43867925 |
| PERSONALIZED | 169 | 0.19929245 |
| NATIONAL | 150 | 0.17688679 |
| NEWSLETTER | 109 | 0.12853774 |
| LOCAL | 48 | 0.05660377 |
# Bar chart: number of campaigns per campaign type.
ggplot(plot5, aes(x = TYP_CAMP, y = TOT_CLIs)) +
  geom_col(fill = "steelblue") +
  geom_text(aes(label = TOT_CLIs), color = "white", vjust = 1.6, size = 3.5) +
  theme_minimal()
#### INGESTION df_6 email events ####
# Read the raw campaign-event log and work on a copy.
email_event <- read.csv(paste0(data_dir,"raw_6_camp_event.csv"), sep=";")
campaign_event_clean <- email_event
We parse EVENT_DATE as a datetime and then extract the hour and the calendar date from it.
# Parse the raw EVENT_DATE string (ISO "T" separator) into a POSIXct
# timestamp, then derive the hour and the calendar date from it.
campaign_event_clean <- campaign_event_clean %>%
mutate(EVENT_DATETIME = as.POSIXct(EVENT_DATE, format="%Y-%m-%dT%H:%M:%S")) %>%
mutate(EVENT_HOUR = hour(EVENT_DATETIME)) %>%
mutate(EVENT_DATE = as.Date(EVENT_DATETIME))
We check the consistency between dataset 1 and dataset 6.
# Consistency check on ID_CLI between df_1 and df_6.
cons_idcli_df1_df6 <- fidelity_clean %>%
select(ID_CLI) %>%
distinct() %>%
mutate(is_in_df_1 = 1) %>%
distinct() %>%
full_join(campaign_event_clean %>%
select(ID_CLI) %>%
distinct() %>%
mutate(is_in_df_6 = 1) %>%
distinct()
, by = "ID_CLI"
) %>%
group_by(is_in_df_1, is_in_df_6) %>%
dplyr::summarize(NUM_ID_CLIs = n_distinct(ID_CLI)) %>%
as.data.frame()
cons_idcli_df1_df6 %>% formattable()
| is_in_df_1 | is_in_df_6 | NUM_ID_CLIs |
|---|---|---|
| 1 | 1 | 202023 |
| 1 | NA | 167449 |
We see that every client in dataset 6 is in dataset 1, but not vice versa. Then we check the consistency between df_6 and df_5.
# Consistency check on ID_CAMP between df_5 (catalogue) and df_6 (events).
cons_idcamp_df5_df6 <- campaign_category_clean %>%
select(ID_CAMP) %>%
distinct() %>%
mutate(is_in_df_5 = 1) %>%
distinct() %>%
full_join(campaign_event_clean %>%
select(ID_CAMP) %>%
distinct() %>%
mutate(is_in_df_6 = 1) %>%
distinct()
, by = "ID_CAMP"
) %>%
group_by(is_in_df_5, is_in_df_6) %>%
dplyr::summarize(NUM_ID_CAMPs = n_distinct(ID_CAMP)) %>%
as.data.frame()
cons_idcamp_df5_df6 %>% formattable()
| is_in_df_5 | is_in_df_6 | NUM_ID_CAMPs |
|---|---|---|
| 1 | 1 | 149 |
| 1 | NA | 699 |
Same result for df_5 and df_6: the campaigns in df_6 are contained in df_5 but not the opposite. Now we start reshaping the dataset.
## Remap TYPE_EVENT values "E" [ERROR] and "B" [BOUNCE] into one level "F" [FAILURE] ##
# %in% against the set is clearer than the chained `== | ==` comparisons.
campaign_event_clean <- campaign_event_clean %>%
  mutate(TYP_EVENT = as.factor(if_else(TYP_EVENT %in% c("E", "B"), "F", as.character(TYP_EVENT))))
## Attach the campaign type from df_5 to every event.
campaign_event_clean <- campaign_event_clean %>%
left_join(campaign_category_clean
, by = "ID_CAMP")
We organize the data by attaching to each send event the corresponding opens/clicks/fails.
# Send events: one row per send with the identifying keys and the send date.
df_sends <- campaign_event_clean %>%
filter(TYP_EVENT == "S") %>%
select(-TYP_EVENT) %>%
select(ID_EVENT_S = ID_EVENT
, ID_CLI
, ID_CAMP
, TYP_CAMP
, ID_DELIVERY
, SEND_DATE = EVENT_DATE) %>%
as.data.frame()
# Open events ("V"), with keys, timestamp and date.
df_opens_prep <- campaign_event_clean %>%
filter(TYP_EVENT == "V") %>%
select(-TYP_EVENT) %>%
select(ID_EVENT_O = ID_EVENT
, ID_CLI
, ID_CAMP
, TYP_CAMP
, ID_DELIVERY
, OPEN_DATETIME = EVENT_DATETIME
, OPEN_DATE = EVENT_DATE)
# Count the distinct opens of each (client, campaign, delivery) triple.
total_opens <- df_opens_prep %>%
group_by(ID_CLI
, ID_CAMP
, ID_DELIVERY) %>%
dplyr::summarize(NUM_OPENs = n_distinct(ID_EVENT_O))
# Keep only the FIRST open of each delivery, annotated with the open count.
df_opens <- df_opens_prep %>%
left_join(total_opens
, by = c("ID_CLI", "ID_CAMP", "ID_DELIVERY")) %>%
group_by(ID_CLI
, ID_CAMP
, ID_DELIVERY) %>%
filter(OPEN_DATETIME == min(OPEN_DATETIME)) %>%
filter(row_number() == 1) %>%
ungroup() %>%
as.data.frame()
# Clicks: there can be multiple clicks on the same communication, so we
# 1- count the click events per (client, campaign, delivery)
# 2- explicitly keep only the first click (done in the next step)
df_clicks_prep <- campaign_event_clean %>%
filter(TYP_EVENT == "C") %>%
select(-TYP_EVENT) %>%
select(ID_EVENT_C = ID_EVENT
, ID_CLI
, ID_CAMP
, TYP_CAMP
, ID_DELIVERY
, CLICK_DATETIME = EVENT_DATETIME
, CLICK_DATE = EVENT_DATE)
# Count the distinct clicks per (client, campaign, delivery) triple.
total_clicks <- df_clicks_prep %>%
group_by(ID_CLI
, ID_CAMP
, ID_DELIVERY) %>%
dplyr::summarize(NUM_CLICKs = n_distinct(ID_EVENT_C))
# Keep only the FIRST click of each delivery, annotated with the click count.
df_clicks <- df_clicks_prep %>%
left_join(total_clicks
, by = c("ID_CLI", "ID_CAMP", "ID_DELIVERY")) %>%
group_by(ID_CLI
, ID_CAMP
, ID_DELIVERY) %>%
filter(CLICK_DATETIME == min(CLICK_DATETIME)) %>%
filter(row_number() == 1) %>%
ungroup() %>%
as.data.frame()
# Failures (errors + bounces remapped to "F"): keep the first per delivery.
df_fails <- campaign_event_clean %>%
filter(TYP_EVENT == "F") %>%
select(-TYP_EVENT) %>%
select(ID_EVENT_F = ID_EVENT
, ID_CLI
, ID_CAMP
, TYP_CAMP
, ID_DELIVERY
, FAIL_DATETIME = EVENT_DATETIME
, FAIL_DATE = EVENT_DATE) %>%
group_by(ID_CLI, ID_CAMP, ID_DELIVERY) %>%
filter(FAIL_DATETIME == min(FAIL_DATETIME)) %>%
filter(row_number() == 1) %>%
ungroup() %>%
as.data.frame()
# Combine sends with opens, clicks and fails into one row per send event.
# Each filter keeps a row when the joined event is absent (NA date) or when
# its date is consistent with the send/open date.
# NOTE(review): a click joined to a row whose OPEN_DATE is NA is dropped by
# the OPEN_DATE <= CLICK_DATE filter (NA comparison) — presumably intended,
# but worth confirming.
campaign_event_clean_final6 <- df_sends %>%
left_join(df_opens
, by = c("ID_CLI", "ID_CAMP", "ID_DELIVERY", "TYP_CAMP")
) %>%
filter(is.na(OPEN_DATE) | SEND_DATE <= OPEN_DATE) %>%
left_join(df_clicks
, by = c("ID_CLI", "ID_CAMP", "ID_DELIVERY", "TYP_CAMP")
) %>%
filter(is.na(CLICK_DATE) | OPEN_DATE <= CLICK_DATE) %>%
left_join(df_fails
, by = c("ID_CLI", "ID_CAMP", "ID_DELIVERY", "TYP_CAMP")
) %>%
filter(is.na(FAIL_DATE) | SEND_DATE <= FAIL_DATE) %>%
mutate(OPENED = !is.na(ID_EVENT_O)) %>%
mutate(CLICKED = !is.na(ID_EVENT_C)) %>%
mutate(FAILED = !is.na(ID_EVENT_F)) %>%
mutate(DAYS_TO_OPEN = as.integer(OPEN_DATE - SEND_DATE)) %>%
select(ID_EVENT_S
, ID_CLI
, ID_CAMP
, TYP_CAMP
, ID_DELIVERY
, SEND_DATE
, OPENED
, OPEN_DATE
, DAYS_TO_OPEN
, NUM_OPENs
, CLICKED
, CLICK_DATE
, NUM_CLICKs
, FAILED
)
## Overall aggregate: send-date range, number of send events and of clients.
df6_overview <- campaign_event_clean_final6 %>%
dplyr::summarize(MIN_DATE = min(SEND_DATE)
, MAX_DATE = max(SEND_DATE)
, TOT_EVENTs = n_distinct(ID_EVENT_S)
, TOT_CLIs = n_distinct(ID_CLI))
df6_overview %>% formattable()
| MIN_DATE | MAX_DATE | TOT_EVENTs | TOT_CLIs |
|---|---|---|---|
| 2019-01-03 | 2019-04-30 | 1556646 | 190427 |
# Same aggregate, broken down by campaign type.
df6_overviewbytyp <- campaign_event_clean_final6 %>%
group_by(TYP_CAMP) %>%
dplyr::summarize(MIN_DATE = min(SEND_DATE)
, MAX_DATE = max(SEND_DATE)
, TOT_EVENTs = n_distinct(ID_EVENT_S)
, TOT_CLIs = n_distinct(ID_CLI))
df6_overviewbytyp %>% head() %>% formattable()
| TYP_CAMP | MIN_DATE | MAX_DATE | TOT_EVENTs | TOT_CLIs |
|---|---|---|---|---|
| LOCAL | 2019-02-02 | 2019-04-02 | 151719 | 87894 |
| NATIONAL | 2019-01-07 | 2019-04-23 | 833085 | 177153 |
| PERSONALIZED | 2019-01-03 | 2019-04-30 | 194840 | 133908 |
| PRODUCT | 2019-01-03 | 2019-04-25 | 377002 | 69724 |
# Bar chart: distinct clients reached per campaign type.
ggplot(df6_overviewbytyp, aes(x = TYP_CAMP, y = TOT_CLIs)) +
  geom_col(fill = "steelblue") +
  geom_text(aes(label = TOT_CLIs), color = "white", vjust = 1.6, size = 3.5) +
  theme_minimal()
### Variable OPENED ###
## Events/clients split by whether the send was ever opened. The client
## percentages can sum above 1 because a client may appear on both sides.
df6_dist_opened <- campaign_event_clean_final6 %>%
group_by(OPENED) %>%
dplyr::summarize(TOT_EVENTs = n_distinct(ID_EVENT_S)
, TOT_CLIs = n_distinct(ID_CLI)) %>%
mutate(TYP_CAMP = 'ALL') %>%
mutate(PERCENT_EVENTs = TOT_EVENTs/df6_overview$TOT_EVENTs
, PERCENT_CLIs = TOT_CLIs/df6_overview$TOT_CLIs)
df6_dist_opened %>% head() %>% formattable()
| OPENED | TOT_EVENTs | TOT_CLIs | TYP_CAMP | PERCENT_EVENTs | PERCENT_CLIs |
|---|---|---|---|---|---|
| FALSE | 1278264 | 178378 | ALL | 0.8211655 | 0.9367264 |
| TRUE | 278382 | 83420 | ALL | 0.1788345 | 0.4380681 |
# Filled bar: proportion of opened vs unopened send events overall.
ggplot(df6_dist_opened, aes(x = TYP_CAMP, y = TOT_EVENTs, fill = OPENED)) +
  geom_col(position = "fill") +
  theme_minimal()
### Variable OPENED by TYP_CAMP ###
## Open rates per campaign type, as share of events and of clients.
df6_dist_openedbytyp <- campaign_event_clean_final6 %>%
group_by(TYP_CAMP, OPENED) %>%
dplyr::summarize(TOT_EVENTs = n_distinct(ID_EVENT_S)
, TOT_CLIs = n_distinct(ID_CLI)) %>%
left_join(df6_overviewbytyp %>%
select(TYP_CAMP
, ALL_TOT_EVENTs = TOT_EVENTs
, ALL_TOT_CLIs = TOT_CLIs)
, by='TYP_CAMP') %>%
mutate(PERCENT_EVENTs = TOT_EVENTs/ALL_TOT_EVENTs
, PERCENT_CLIs = TOT_CLIs/ALL_TOT_CLIs) %>%
select(TYP_CAMP
, OPENED
, TOT_EVENTs
, TOT_CLIs
, PERCENT_EVENTs
, PERCENT_CLIs
)
df6_dist_openedbytyp %>% head() %>% formattable()
| TYP_CAMP | OPENED | TOT_EVENTs | TOT_CLIs | PERCENT_EVENTs | PERCENT_CLIs |
|---|---|---|---|---|---|
| LOCAL | FALSE | 126700 | 76835 | 0.8350965 | 0.8741780 |
| LOCAL | TRUE | 25019 | 18029 | 0.1649035 | 0.2051221 |
| NATIONAL | FALSE | 710721 | 162049 | 0.8531194 | 0.9147404 |
| NATIONAL | TRUE | 122364 | 62964 | 0.1468806 | 0.3554216 |
| PERSONALIZED | FALSE | 156431 | 111942 | 0.8028690 | 0.8359620 |
| PERSONALIZED | TRUE | 38409 | 31327 | 0.1971310 | 0.2339442 |
# Absolute counts, then proportions, of opened send events per campaign type.
ggplot(df6_dist_openedbytyp, aes(x = TYP_CAMP, y = TOT_EVENTs, fill = OPENED)) +
  geom_col() +
  theme_minimal()
ggplot(df6_dist_openedbytyp, aes(x = TYP_CAMP, y = TOT_EVENTs, fill = OPENED)) +
  geom_col(position = "fill") +
  theme_minimal()
### Variable DAYS_TO_OPEN
## Per client: the (floored) average days between send and first open,
## then the distribution of clients over that average.
df6_dist_daystoopen <- campaign_event_clean_final6 %>%
filter(OPENED) %>%
group_by(ID_CLI) %>%
dplyr::summarize(AVG_DAYS_TO_OPEN = floor(mean(DAYS_TO_OPEN))) %>%
ungroup() %>%
group_by(AVG_DAYS_TO_OPEN) %>%
dplyr::summarize(TOT_CLIs = n_distinct(ID_CLI))
df6_dist_daystoopen %>% head() %>% formattable()
| AVG_DAYS_TO_OPEN | TOT_CLIs |
|---|---|
| 0 | 53379 |
| 1 | 12221 |
| 2 | 4849 |
| 3 | 3036 |
| 4 | 2131 |
| 5 | 1467 |
## Client counts by average days-to-open, restricted to under two weeks.
ggplot(df6_dist_daystoopen %>% filter(AVG_DAYS_TO_OPEN < 14),
       aes(x = AVG_DAYS_TO_OPEN, y = TOT_CLIs)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_text(aes(label = TOT_CLIs), vjust = 1.6, color = "white", size = 3.5) +
  theme_minimal()
### DAYS_TO_OPEN vs CUMULATE PERCENT ###
## compute aggregate: cumulative share of clients covered as the average
## days-to-open threshold grows.
df6_dist_daystoopen_vs_cumulate <- df6_dist_daystoopen %>%
  arrange(AVG_DAYS_TO_OPEN) %>%
  mutate(PERCENT_COVERED = cumsum(TOT_CLIs) / sum(TOT_CLIs))
## plot aggregate
## FIX(review): the original built this exact plot twice — once stored in
## `plot_df6_dist_daystoopen_vs_cumulate` (never printed) and once inline.
## Build it once and print the stored object.
plot_df6_dist_daystoopen_vs_cumulate <- (
  ggplot(data = df6_dist_daystoopen_vs_cumulate %>%
           filter(AVG_DAYS_TO_OPEN < 14),
         aes(x = AVG_DAYS_TO_OPEN, y = PERCENT_COVERED)) +
    geom_line() +
    geom_point() +
    scale_x_continuous(breaks = seq(0, 14, 2), minor_breaks = 0:14) +
    theme_minimal()
)
plot_df6_dist_daystoopen_vs_cumulate
# - CLICKED/CLICKED by TYP_CAMP
## compute aggregate
## NOTE(review): this OVERWRITES `df6_dist_openedbytyp` (previously the
## OPENED aggregate) with the CLICKED aggregate. A distinct name such as
## `df6_dist_clickedbytyp` would avoid the shadowing; the name is kept here
## because the display and plot below reference it.
df6_dist_openedbytyp <- campaign_event_clean_final6 %>%
group_by(TYP_CAMP, CLICKED) %>%
dplyr::summarize(TOT_EVENTs = n_distinct(ID_EVENT_S)
, TOT_CLIs = n_distinct(ID_CLI)) %>%
left_join(df6_overviewbytyp %>%
select(TYP_CAMP
, ALL_TOT_EVENTs = TOT_EVENTs
, ALL_TOT_CLIs = TOT_CLIs)
, by='TYP_CAMP') %>%
# Shares are relative to each campaign type's own totals.
mutate(PERCENT_EVENTs = TOT_EVENTs/ALL_TOT_EVENTs
, PERCENT_CLIs = TOT_CLIs/ALL_TOT_CLIs) %>%
select(TYP_CAMP
, CLICKED
, TOT_EVENTs
, TOT_CLIs
, PERCENT_EVENTs
, PERCENT_CLIs
)
df6_dist_openedbytyp %>% head() %>% formattable()
| TYP_CAMP | CLICKED | TOT_EVENTs | TOT_CLIs | PERCENT_EVENTs | PERCENT_CLIs |
|---|---|---|---|---|---|
| LOCAL | FALSE | 150374 | 87332 | 0.991134927 | 0.99360593 |
| LOCAL | TRUE | 1345 | 1280 | 0.008865073 | 0.01456300 |
| NATIONAL | FALSE | 815796 | 175939 | 0.979247016 | 0.99314717 |
| NATIONAL | TRUE | 17289 | 14216 | 0.020752984 | 0.08024702 |
| PERSONALIZED | FALSE | 192741 | 133043 | 0.989227058 | 0.99354034 |
| PERSONALIZED | TRUE | 2099 | 2016 | 0.010772942 | 0.01505511 |
## Clicked vs non-clicked events per campaign type.
## FIX(review): the original passed a hard-coded `fill="steelblue"` to
## geom_bar(), which silently overrode the aes(fill=CLICKED) mapping, and
## placed geom_text labels with a fixed vjust on a stacked bar chart;
## labels are now centered inside each stacked segment.
ggplot(data = df6_dist_openedbytyp, aes(fill = CLICKED, x = TYP_CAMP, y = TOT_EVENTs)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = TOT_CLIs), position = position_stack(vjust = 0.5),
            color = "white", size = 3.5) +
  theme_minimal()
# - FAILED/FAILED by TYP_CAMP
## Aggregate failed/non-failed events per campaign type, with within-type
## shares.
## FIX(review): renamed from `df6_dist_openedbytyp` (which would have been
## its third meaning in this script) to `df6_dist_failedbytyp`; nothing
## downstream referenced the old name after this point. Header typo
## "TYP_CAP" fixed as well.
df6_dist_failedbytyp <- campaign_event_clean_final6 %>%
  group_by(TYP_CAMP, FAILED) %>%
  dplyr::summarize(TOT_EVENTs = n_distinct(ID_EVENT_S)
                   , TOT_CLIs = n_distinct(ID_CLI)) %>%
  left_join(df6_overviewbytyp %>%
              select(TYP_CAMP
                     , ALL_TOT_EVENTs = TOT_EVENTs
                     , ALL_TOT_CLIs = TOT_CLIs)
            , by='TYP_CAMP') %>%
  mutate(PERCENT_EVENTs = TOT_EVENTs/ALL_TOT_EVENTs
         , PERCENT_CLIs = TOT_CLIs/ALL_TOT_CLIs) %>%
  select(TYP_CAMP
         , FAILED
         , TOT_EVENTs
         , TOT_CLIs
         , PERCENT_EVENTs
         , PERCENT_CLIs
  )
# Load the raw ticket/receipt data (df7) and keep a working copy so the raw
# frame stays untouched.
purchase_ticket <- read.csv(paste0(data_dir,"raw_7_tic.csv"), sep=";")
tickets_clean <- purchase_ticket
Format the columns: first the date fields, then the categorical variables as factors.
# Parse the raw DATETIME string into a POSIXct timestamp, then derive the
# purchase hour and calendar date; the raw string column is dropped.
# NOTE(review): the format "%Y-%m-%dT%H%M%S" has no colons between hour,
# minute and second — confirm the raw strings really look like
# "2019-01-01T120000"; if they are "...T12:00:00" this parse yields NA.
tickets_clean <- tickets_clean %>%
mutate(TIC_DATETIME = as.POSIXct(DATETIME, format="%Y-%m-%dT%H%M%S")) %>%
mutate(TIC_HOUR = hour(TIC_DATETIME)) %>%
mutate(TIC_DATE = as.Date(TIC_DATETIME)) %>%
select(-DATETIME)
# Treat direction (values 1 and -1; presumably purchase vs refund — TODO
# confirm) and department code as categorical.
tickets_clean <- tickets_clean %>%
mutate(DIREZIONE = as.factor(DIREZIONE)) %>%
mutate(COD_REPARTO = as.factor(COD_REPARTO))
Check consistency between df1 and df7
## Consistency check: count distinct clients present in df1 (fidelity),
## df7 (tickets), or both, via flag columns and a full join.
## FIX(review): the original called distinct() a second time after
## mutate(is_in_df_* = 1) on each side — a no-op, since mutate adds a
## constant column to already-distinct rows; the redundant calls were
## removed.
cons_idcli_df1_df7 <- fidelity_clean %>%
  select(ID_CLI) %>%
  distinct() %>%
  mutate(is_in_df_1 = 1) %>%
  full_join(tickets_clean %>%
              select(ID_CLI) %>%
              distinct() %>%
              mutate(is_in_df_7 = 1),
            by = "ID_CLI") %>%
  group_by(is_in_df_1, is_in_df_7) %>%
  dplyr::summarize(NUM_ID_CLIs = n_distinct(ID_CLI)) %>%
  as.data.frame()
cons_idcli_df1_df7 %>% formattable()
| is_in_df_1 | is_in_df_7 | NUM_ID_CLIs |
|---|---|---|
| 1 | 1 | 212124 |
| 1 | NA | 157348 |
We can conclude that every client in df7 also appears in df1, but not every client in df1 appears in df7.
Now we proceed with a Reshape of df7
tickets_clean_final <- tickets_clean %>%
## adding day characterization ##
# NOTE(review): wday() numbering depends on the week-start convention; with
# lubridate's default (Sunday = 1) the weekend is c(1, 7), not c(6, 7) —
# as written, 6/7 mean Friday/Saturday. Confirm the intended convention.
mutate(TIC_DATE_WEEKDAY = wday(TIC_DATE)) %>%
mutate(TIC_DATE_HOLIDAY = isHoliday("Italy", TIC_DATE)) %>%
# NOTE(review): case_when() takes the FIRST matching branch, so a holiday
# that falls on a "weekend" day is labelled weekend, not holiday. Also,
# every weekday value except 7 is caught by the `< 7` branch and 7 is
# caught by the weekend branch, so "other" is unreachable (barring NAs).
mutate(TIC_DATE_TYP = case_when(
(TIC_DATE_WEEKDAY %in% c(6,7)) ~ "weekend",
(TIC_DATE_HOLIDAY == TRUE) ~ "holiday",
(TIC_DATE_WEEKDAY < 7) ~ "weekday",
TRUE ~ "other"))
We start with an overview of the dataset
## Dataset overview: observed date span plus distinct ticket and client
## counts.
df7_overview <- tickets_clean_final %>%
  dplyr::summarize(
    MIN_DATE = min(TIC_DATE),
    MAX_DATE = max(TIC_DATE),
    TOT_TICs = n_distinct(ID_SCONTRINO),
    TOT_CLIs = n_distinct(ID_CLI)
  )
formattable(df7_overview)
| MIN_DATE | MAX_DATE | TOT_TICs | TOT_CLIs |
|---|---|---|---|
| 2018-05-01 | 2019-04-30 | 998035 | 212124 |
Then we compute some aggregations.
## Tickets and clients by purchase direction, with shares relative to the
## overall totals. Note a client can appear in both directions, so the
## client shares do not sum to 1.
df7_dist_direction <- tickets_clean_final %>%
  group_by(DIREZIONE) %>%
  dplyr::summarize(
    TOT_TICs = n_distinct(ID_SCONTRINO),
    TOT_CLIs = n_distinct(ID_CLI)
  ) %>%
  mutate(
    PERCENT_TICs = TOT_TICs / df7_overview$TOT_TICs,
    PERCENT_CLIs = TOT_CLIs / df7_overview$TOT_CLIs
  )
## `summarise()` ungrouping output (override with `.groups` argument)
df7_dist_direction %>% formattable()
| DIREZIONE | TOT_TICs | TOT_CLIs | PERCENT_TICs | PERCENT_CLIs |
|---|---|---|---|---|
| -1 | 90189 | 46622 | 0.09036657 | 0.2197865 |
| 1 | 907846 | 212124 | 0.90963343 | 1.0000000 |
Variable TIC_HOUR
## Tickets/clients per hour of day, split by direction; shares are computed
## relative to each direction's own totals.
df7_dist_hour <- tickets_clean_final %>%
  group_by(TIC_HOUR, DIREZIONE) %>%
  dplyr::summarize(
    TOT_TICs = n_distinct(ID_SCONTRINO),
    TOT_CLIs = n_distinct(ID_CLI)
  ) %>%
  left_join(
    df7_dist_direction %>%
      select(DIREZIONE, ALL_TOT_TICs = TOT_TICs, ALL_TOT_CLIs = TOT_CLIs),
    by = "DIREZIONE"
  ) %>%
  mutate(
    PERCENT_TICs = TOT_TICs / ALL_TOT_TICs,
    PERCENT_CLIs = TOT_CLIs / ALL_TOT_CLIs
  ) %>%
  select(-ALL_TOT_TICs, -ALL_TOT_CLIs)
df7_dist_hour %>% formattable() %>% head()
| TIC_HOUR | DIREZIONE | TOT_TICs | TOT_CLIs | PERCENT_TICs | PERCENT_CLIs |
|---|---|---|---|---|---|
| 3 | 1 | 2 | 2 | 0.000002203017 | 0.000009428448 |
| 4 | 1 | 4 | 4 | 0.000004406034 | 0.000018856895 |
| 5 | 1 | 2 | 2 | 0.000002203017 | 0.000009428448 |
| 6 | 1 | 32 | 32 | 0.000035248269 | 0.000150855160 |
| 7 | -1 | 759 | 638 | 0.008415660446 | 0.013684526618 |
| 7 | 1 | 9249 | 6309 | 0.010187851243 | 0.029742037676 |
## Ticket counts per hour by direction: absolute counts, then shares.
ggplot(df7_dist_hour, aes(x = TIC_HOUR, y = TOT_TICs, fill = DIREZIONE)) +
  geom_bar(stat = "identity") +
  theme_minimal()
ggplot(df7_dist_hour, aes(x = TIC_HOUR, y = TOT_TICs, fill = DIREZIONE)) +
  geom_bar(stat = "identity", position = "fill") +
  theme_minimal()
Variable COD_REPARTO
## Tickets/clients per department, split by direction; shares relative to
## each direction's totals.
df7_dist_dep <- tickets_clean_final %>%
  group_by(COD_REPARTO, DIREZIONE) %>%
  dplyr::summarize(
    TOT_TICs = n_distinct(ID_SCONTRINO),
    TOT_CLIs = n_distinct(ID_CLI)
  ) %>%
  left_join(
    df7_dist_direction %>%
      select(DIREZIONE, ALL_TOT_TICs = TOT_TICs, ALL_TOT_CLIs = TOT_CLIs),
    by = "DIREZIONE"
  ) %>%
  mutate(
    PERCENT_TICs = TOT_TICs / ALL_TOT_TICs,
    PERCENT_CLIs = TOT_CLIs / ALL_TOT_CLIs
  ) %>%
  select(-ALL_TOT_TICs, -ALL_TOT_CLIs)
df7_dist_dep %>% formattable() %>% head()
| COD_REPARTO | DIREZIONE | TOT_TICs | TOT_CLIs | PERCENT_TICs | PERCENT_CLIs |
|---|---|---|---|---|---|
| 1 | -1 | 2512 | 1928 | 0.02785262 | 0.04135387 |
| 1 | 1 | 65151 | 30531 | 0.07176437 | 0.14392997 |
| 2 | -1 | 5604 | 4598 | 0.06213618 | 0.09862297 |
| 2 | 1 | 94618 | 54055 | 0.10422252 | 0.25482737 |
| 3 | -1 | 17083 | 10835 | 0.18941334 | 0.23240101 |
| 3 | 1 | 229972 | 93224 | 0.25331609 | 0.43947880 |
## Ticket counts per department by direction: absolute counts, then shares.
ggplot(df7_dist_dep, aes(x = COD_REPARTO, y = TOT_TICs, fill = DIREZIONE)) +
  geom_bar(stat = "identity") +
  theme_minimal()
ggplot(df7_dist_dep, aes(x = COD_REPARTO, y = TOT_TICs, fill = DIREZIONE)) +
  geom_bar(stat = "identity", position = "fill") +
  theme_minimal()
Variable TIC_DATE_TYP
## Tickets/clients per day type (weekday/weekend/holiday), split by
## direction; shares relative to each direction's totals.
df7_dist_datetyp <- tickets_clean_final %>%
  group_by(TIC_DATE_TYP, DIREZIONE) %>%
  dplyr::summarize(
    TOT_TICs = n_distinct(ID_SCONTRINO),
    TOT_CLIs = n_distinct(ID_CLI)
  ) %>%
  left_join(
    df7_dist_direction %>%
      select(DIREZIONE, ALL_TOT_TICs = TOT_TICs, ALL_TOT_CLIs = TOT_CLIs),
    by = "DIREZIONE"
  ) %>%
  mutate(
    PERCENT_TICs = TOT_TICs / ALL_TOT_TICs,
    PERCENT_CLIs = TOT_CLIs / ALL_TOT_CLIs
  ) %>%
  select(-ALL_TOT_TICs, -ALL_TOT_CLIs)
df7_dist_datetyp %>% formattable() %>% head()
| TIC_DATE_TYP | DIREZIONE | TOT_TICs | TOT_CLIs | PERCENT_TICs | PERCENT_CLIs |
|---|---|---|---|---|---|
| holiday | -1 | 14522 | 10759 | 0.1610174 | 0.2307709 |
| holiday | 1 | 157868 | 81742 | 0.1738929 | 0.3853501 |
| weekday | -1 | 47294 | 28981 | 0.5243877 | 0.6216164 |
| weekday | 1 | 452679 | 153338 | 0.4986297 | 0.7228696 |
| weekend | -1 | 28373 | 19565 | 0.3145949 | 0.4196517 |
| weekend | 1 | 297299 | 125137 | 0.3274773 | 0.5899238 |
## Ticket counts per day type by direction: absolute counts, then shares.
ggplot(df7_dist_datetyp, aes(x = TIC_DATE_TYP, y = TOT_TICs, fill = DIREZIONE)) +
  geom_bar(stat = "identity") +
  theme_minimal()
ggplot(df7_dist_datetyp, aes(x = TIC_DATE_TYP, y = TOT_TICs, fill = DIREZIONE)) +
  geom_bar(stat = "identity", position = "fill") +
  theme_minimal()
Variable average IMPORTO_LORDO and average SCONTO per TICKET
## Working columns for the amount analysis: ticket id as character, gross
## amount and discount as numeric.
## FIX(review): the original assigned each column <- NULL immediately before
## re-assigning it — a dead store with no effect; removed.
## NOTE(review): as.numeric() on a FACTOR returns the underlying level
## codes, not the printed values. If IMPORTO_LORDO / SCONTO were read as
## factors (e.g. decimal commas in the CSV), use
## as.numeric(as.character(.)) instead — confirm the columns' class.
tickets_clean_final$ID_SCONTRINO2 <- as.character(tickets_clean_final$ID_SCONTRINO)
tickets_clean_final$IMPORTO_LORDO2 <- as.numeric(tickets_clean_final$IMPORTO_LORDO)
tickets_clean_final$SCONTO2 <- as.numeric(tickets_clean_final$SCONTO)
## Collapse line items to one row per ticket/direction (summed gross amount
## and discount), then average those per-ticket totals by direction.
df7_dist_importosconto <- tickets_clean_final %>%
  group_by(ID_SCONTRINO2, DIREZIONE) %>%
  dplyr::summarize(
    IMPORTO_LORDO2 = sum(IMPORTO_LORDO2),
    SCONTO2 = sum(SCONTO2)
  ) %>%
  ungroup() %>%
  as.data.frame()
df7_dist_avgimportosconto <- df7_dist_importosconto %>%
  group_by(DIREZIONE) %>%
  dplyr::summarize(
    AVG_IMPORTO_LORDO = mean(IMPORTO_LORDO2),
    AVG_SCONTO = mean(SCONTO2)
  )
df7_dist_avgimportosconto %>% formattable() %>% head()
| DIREZIONE | AVG_IMPORTO_LORDO | AVG_SCONTO |
|---|---|---|
| -1 | 10764.16 | 5829.931 |
| 1 | 114960.70 | 21767.375 |
## Histogram of per-ticket gross amount, trimmed to (-1000, 1000).
ggplot(data=df7_dist_importosconto %>% filter((IMPORTO_LORDO2 > -1000) & (IMPORTO_LORDO2 < 1000)), aes(color=DIREZIONE, x=IMPORTO_LORDO2)) +
  geom_histogram(binwidth=10, fill="steelblue", alpha=0.5) +
  theme_minimal()
## Histogram of per-ticket discount, trimmed to (-250, 250).
## FIX(review): the original upper bound filtered IMPORTO_LORDO2 instead of
## SCONTO2 (copy-paste slip); both bounds now apply to the plotted variable.
ggplot(data=df7_dist_importosconto %>% filter((SCONTO2 > -250) & (SCONTO2 < 250)), aes(color=DIREZIONE, x=SCONTO2)) +
  geom_histogram(binwidth=10, fill="steelblue", alpha=0.5) +
  theme_minimal()
This is for each sales department
## Total gross amount and discount per department/direction, then the mean
## of those department totals within each direction.
df7_dist_importosconto_reparto <- tickets_clean_final %>%
  group_by(COD_REPARTO, DIREZIONE) %>%
  dplyr::summarize(
    IMPORTO_LORDO2 = sum(IMPORTO_LORDO2),
    SCONTO2 = sum(SCONTO2)
  ) %>%
  ungroup() %>%
  as.data.frame()
df7_dist_avgimportosconto_reparto <- df7_dist_importosconto_reparto %>%
  group_by(DIREZIONE) %>%
  dplyr::summarize(
    AVG_IMPORTO_LORDO = mean(IMPORTO_LORDO2),
    AVG_SCONTO = mean(SCONTO2)
  )
df7_dist_avgimportosconto_reparto %>% formattable() %>% head()
| DIREZIONE | AVG_IMPORTO_LORDO | AVG_SCONTO |
|---|---|---|
| -1 | 69343518 | 37556834 |
| 1 | 7454758287 | 1411530303 |
## Histograms of per-department gross amount and discount totals.
## FIX(review): `%>% filter()` with no predicate is a no-op and was removed.
ggplot(data=df7_dist_importosconto_reparto, aes(color=DIREZIONE, x=IMPORTO_LORDO2)) +
  geom_histogram(fill="steelblue", alpha=0.5) +
  theme_minimal()
ggplot(data=df7_dist_importosconto_reparto, aes(color=DIREZIONE, x=SCONTO2)) +
  geom_histogram(fill="steelblue", alpha=0.5) +
  theme_minimal()
The distribution for ID_ARTICOLO.
# EXPLORE average IMPORTO_LORDO and average SCONTO per ID_CLI
## Collapse to one row per client/direction (summed amounts), then average
## those client totals within each direction.
df7_dist_importosconto_cli <- tickets_clean_final %>%
  group_by(ID_CLI, DIREZIONE) %>%
  dplyr::summarize(
    IMPORTO_LORDO2 = sum(IMPORTO_LORDO2),
    SCONTO2 = sum(SCONTO2)
  ) %>%
  ungroup() %>%
  as.data.frame()
df7_dist_avgimportosconto_cli <- df7_dist_importosconto_cli %>%
  group_by(DIREZIONE) %>%
  dplyr::summarize(
    AVG_IMPORTO_LORDO = mean(IMPORTO_LORDO2),
    AVG_SCONTO = mean(SCONTO2)
  )
df7_dist_avgimportosconto_cli %>% formattable() %>% head()
| DIREZIONE | AVG_IMPORTO_LORDO | AVG_SCONTO |
|---|---|---|
| -1 | 20822.99 | 11277.84 |
| 1 | 492007.58 | 93159.78 |
## Histograms of per-client gross amount and discount totals.
## FIX(review): `%>% filter()` with no predicate is a no-op and was removed.
ggplot(data=df7_dist_importosconto_cli, aes(color=DIREZIONE, x=IMPORTO_LORDO2)) +
  geom_histogram(fill="steelblue", alpha=0.5) +
  theme_minimal()
ggplot(data=df7_dist_importosconto_cli, aes(color=DIREZIONE, x=SCONTO2)) +
  geom_histogram(fill="steelblue", alpha=0.5) +
  theme_minimal()
The Recency, Frequency, and Monetary (RFM) approach is a method to identify customers who are more likely to respond to new offers. The RFM model is based on three quantitative factors.1
The main purpose of the RFM model, after segmentation, is to suggest possible marketing actions to the decision-makers. These are possible outcomes of an RFM analysis:
We set the last purchase at 31/12/2019.
First we perform some steps to prepare the data.
# Frequency: number of receipt rows per client.
dataset_RFM<-tickets_clean_final
df_receipt<-dataset_RFM[, c("ID_CLI", "ID_SCONTRINO")]
df_orders <- df_receipt %>% count(ID_CLI)
colnames(df_orders)[c(2)] <- c("N_ORDINI")
# NOTE(review): count() already yields one row per ID_CLI, so this
# duplicated() filter is redundant.
number_order_tot <- df_orders[!duplicated(df_orders[ , c("ID_CLI")]),]
# Recency: last purchase date per client, compared against a fixed
# reference date of 2019-12-31.
df_date=dataset_RFM[, c("ID_CLI", "TIC_DATE")]
df_last_date <- df_date %>% group_by(ID_CLI) %>% dplyr::summarise(max(TIC_DATE))
df_last_date[[3]]="2019-12-31"
colnames(df_last_date)[c(2,3)] <- c("last_buy","end_2019")
df_last_date$end_2019 <- as.Date(df_last_date$end_2019)
df_last_date %>% formattable() %>% head()
| ID_CLI | last_buy | end_2019 |
|---|---|---|
| 5 | 2018-11-23 | 2019-12-31 |
| 18 | 2018-11-23 | 2019-12-31 |
| 23 | 2019-02-20 | 2019-12-31 |
| 28 | 2018-10-11 | 2019-12-31 |
| 30 | 2018-07-23 | 2019-12-31 |
| 32 | 2019-04-02 | 2019-12-31 |
# Days since last purchase, as a difftime (not a plain numeric).
df_last_date$days_from_last_buy <- difftime(df_last_date$end_2019, df_last_date$last_buy, units = c("days"))
RECENCY_DAYS=df_last_date[, c("ID_CLI", "days_from_last_buy")]
# Monetary: total gross amount per client, keeping only positive totals.
df_importo_lordo <- dataset_RFM[, c("ID_CLI", "IMPORTO_LORDO2")]
df_importo_tot <- df_importo_lordo %>% group_by(ID_CLI) %>% dplyr::summarise(sum(IMPORTO_LORDO2))
colnames(df_importo_tot)[c(2)] <- c("total_amount")
df_importo_tot <- df_importo_tot %>% filter(total_amount > 0)
REVENUE<-df_importo_tot
# Assemble the customer-level RFM table.
# NOTE(review): RECENCY_DAYS is a subset of df_last_date, so merging
# df_last_date again duplicates days_from_last_buy (the .y column visible
# in the printed table below). Also, column 5 renamed "analysis_date" is
# actually the client's LAST PURCHASE date (last_buy), not an analysis
# date — consider dropping the second merge and renaming honestly.
df_merge <- merge(number_order_tot, REVENUE, by = "ID_CLI")
RFM <- merge(df_merge, RECENCY_DAYS, by = "ID_CLI")
rfm_data_customer <- merge(RFM, df_last_date, by = "ID_CLI")
colnames(rfm_data_customer)[c(1,2,3,4,5)] <- c("customer_id","number_of_orders","revenue","recency_days","analysis_date")
rfm_data_customer %>% formattable() %>% head()
| customer_id | number_of_orders | revenue | recency_days | analysis_date | end_2019 | days_from_last_buy.y |
|---|---|---|---|---|---|---|
| 5 | 6 | 106351 | 403 days | 2018-11-23 | 2019-12-31 | 403 days |
| 18 | 26 | 662706 | 403 days | 2018-11-23 | 2019-12-31 | 403 days |
| 23 | 64 | 1597683 | 314 days | 2019-02-20 | 2019-12-31 | 314 days |
| 28 | 3 | 91478 | 446 days | 2018-10-11 | 2019-12-31 | 446 days |
| 30 | 18 | 403492 | 526 days | 2018-07-23 | 2019-12-31 | 526 days |
| 32 | 31 | 739991 | 273 days | 2019-04-02 | 2019-12-31 | 273 days |
In order to perform RFM analysis I used a famous library called rfm.2 So we associate all the customer to a segment as https://cran.r-project.org/web/packages/rfm/vignettes/rfm-customer-level-data.html#segments suggest.
# Score every customer on recency/frequency/monetary with the rfm package.
# The analysis date matches the last observed ticket date (2019-04-30).
analysis_date <- lubridate::as_date('2019-04-30')
rfm_result <- rfm_table_customer(rfm_data_customer, customer_id, number_of_orders, recency_days, revenue, analysis_date)
rfm_result_df <- rfm_result$rfm
rfm_result_df %>% formattable() %>% head()
| customer_id | recency_days | transaction_count | amount | recency_score | frequency_score | monetary_score | rfm_score |
|---|---|---|---|---|---|---|---|
| 5 | 403 days | 6 | 106351 | 2 | 2 | 2 | 222 |
| 18 | 403 days | 26 | 662706 | 2 | 4 | 4 | 244 |
| 23 | 314 days | 64 | 1597683 | 4 | 5 | 5 | 455 |
| 28 | 446 days | 3 | 91478 | 2 | 2 | 2 | 222 |
| 30 | 526 days | 18 | 403492 | 1 | 4 | 4 | 144 |
| 32 | 273 days | 31 | 739991 | 4 | 5 | 5 | 455 |
# Diagnostic plots provided by the rfm package for the scored table.
rfm_heatmap(rfm_result)
rfm_bar_chart(rfm_result)
rfm_order_dist(rfm_result)
rfm_rf_plot(rfm_result)
rfm_histograms(rfm_result)
## Warning: attributes are not identical across measure variables;
## they will be dropped
## Segment definitions following the rfm package vignette: for each named
## segment, inclusive [lower, upper] bounds on the recency, frequency and
## monetary scores. Frequency and monetary bounds are identical by design.
segment_names <- c(
  "Champions", "Loyal Customers", "Potential Loyalist",
  "New Customers", "Promising", "Need Attention", "About To Sleep",
  "At Risk", "Can't Lose Them", "Lost"
)
recency_lower   <- c(4, 2, 3, 4, 3, 2, 2, 1, 1, 1)
recency_upper   <- c(5, 5, 5, 5, 4, 3, 3, 2, 1, 2)
frequency_lower <- c(4, 3, 1, 1, 1, 2, 1, 2, 4, 1)
frequency_upper <- c(5, 5, 3, 1, 1, 3, 2, 5, 5, 2)
monetary_lower  <- c(4, 3, 1, 1, 1, 2, 1, 2, 4, 1)
monetary_upper  <- c(5, 5, 3, 1, 1, 3, 2, 5, 5, 2)
# Assign every customer to a named segment using the score bounds above.
customers_segmentation <- rfm_segment(rfm_result, segment_names, recency_lower, recency_upper, frequency_lower, frequency_upper, monetary_lower, monetary_upper)
# Median recency / frequency / monetary value per segment.
rfm_plot_median_recency(customers_segmentation)
rfm_plot_median_frequency(customers_segmentation)
rfm_plot_median_monetary(customers_segmentation)
In marketing, the churn variable is important: it is an indicator of whether customers are leaving the company. Knowing this rate matters for a company's economic results. If a company can predict churn, it can also learn more about customer behaviour and understand which of its services are weak. To do so, it can build a supervised machine-learning model:
## Build the churn label: clients with at least one positive-amount
## purchase strictly between 2018-10-01 and 2019-01-01 are "active"
## (CHURN = 0); every other RFM-scored client is labelled churned (1).
dataframe_holdout_method <- tickets_clean_final %>% filter(IMPORTO_LORDO2 > 0,TIC_DATE < as.Date("1/1/2019",format = "%d/%m/%Y"), TIC_DATE > as.Date("01/10/2018",format = "%d/%m/%Y"))
clients_no_churn <- data.frame(ID_CLI = unique(dataframe_holdout_method$ID_CLI), CHURN = 0)
analysis_date_churn <- lubridate::as_date('2019-01-01')
rfm_result_churn<- rfm_table_customer(rfm_data_customer, customer_id, number_of_orders, recency_days, revenue, analysis_date_churn)
## FIX(review): the original segmented `rfm_result` (scored at 2019-04-30),
## leaving `rfm_result_churn` computed but never used; segment on the
## churn-dated table instead.
churn_df_complete <- rfm_segment(rfm_result_churn, segment_names, recency_lower, recency_upper, frequency_lower, frequency_upper, monetary_lower, monetary_upper)
churn_df_complete <- churn_df_complete[, c("customer_id", "segment", "transaction_count", "recency_days", "amount")]
colnames(churn_df_complete)[c(1)] <- c("ID_CLI")
churn_final <- left_join(churn_df_complete, clients_no_churn, by = "ID_CLI")
## FIX(review): replace NAs only in CHURN (clients without a Q4 purchase),
## rather than blanket-replacing NA with 1 across every column.
churn_final$CHURN[is.na(churn_final$CHURN)] <- 1
churn_final$CHURN <- as.factor(churn_final$CHURN)
churn_final$segment <- as.factor(churn_final$segment)
We will use several well-known models and try to find the best one:
## 80/20 stratified train/test split and three candidate models.
## FIX(review): added a seed so the random partition — and every metric
## reported below — is reproducible.
set.seed(42)
train_test_split <- createDataPartition(churn_final$CHURN, p = .80, list = FALSE, times = 1)
train <- churn_final[train_test_split,]
test <- churn_final[-train_test_split,]
# Logistic Regression
logistic <- train(CHURN ~ segment + transaction_count + recency_days + amount, data = train, method = "glm")
# Decision Tree
tree <- rpart(CHURN ~ segment + transaction_count + recency_days + amount, data = train)
# Random Forest
tree_rf <- randomForest(CHURN ~ segment + transaction_count + recency_days + amount, data = train, ntree = 200)
We start the prediction on the df_test and we get the result in different ways. We are gonna get the Accuracy, Lift, ROC e AUC.
# Test
# Random forest: hard class predictions on the held-out set.
prediction_rf <- predict(tree_rf, test, type = "class")
confusionMatrix(prediction_rf, test$CHURN)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 15782 2785
## 1 3312 20545
##
## Accuracy : 0.8563
## 95% CI : (0.8529, 0.8596)
## No Information Rate : 0.5499
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.7089
##
## Mcnemar's Test P-Value : 0.00000000001624
##
## Sensitivity : 0.8265
## Specificity : 0.8806
## Pos Pred Value : 0.8500
## Neg Pred Value : 0.8612
## Prevalence : 0.4501
## Detection Rate : 0.3720
## Detection Prevalence : 0.4377
## Balanced Accuracy : 0.8536
##
## 'Positive' Class : 0
##
# Logistic regression: hard class predictions on the held-out set.
prediction_logistic <- predict(logistic, test, type = "raw")
confusionMatrix(prediction_logistic, test$CHURN)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 15478 5472
## 1 3616 17858
##
## Accuracy : 0.7858
## 95% CI : (0.7818, 0.7897)
## No Information Rate : 0.5499
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.571
##
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.8106
## Specificity : 0.7655
## Pos Pred Value : 0.7388
## Neg Pred Value : 0.8316
## Prevalence : 0.4501
## Detection Rate : 0.3648
## Detection Prevalence : 0.4938
## Balanced Accuracy : 0.7880
##
## 'Positive' Class : 0
##
# Decision tree: hard class predictions on the held-out set.
# FIX(review): predict(..., type = "class") on an rpart fit already returns
# a factor, so the intermediate unlist() wrapper was a no-op and has been
# removed; `prediction_dt` keeps its name for the downstream comparisons.
prediction_dt <- predict(tree, test, type = "class")
confusionMatrix(prediction_dt, test$CHURN)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 16104 2914
## 1 2990 20416
##
## Accuracy : 0.8608
## 95% CI : (0.8575, 0.8641)
## No Information Rate : 0.5499
## P-Value [Acc > NIR] : <0.0000000000000002
##
## Kappa : 0.7188
##
## Mcnemar's Test P-Value : 0.329
##
## Sensitivity : 0.8434
## Specificity : 0.8751
## Pos Pred Value : 0.8468
## Neg Pred Value : 0.8723
## Prevalence : 0.4501
## Detection Rate : 0.3796
## Detection Prevalence : 0.4483
## Balanced Accuracy : 0.8593
##
## 'Positive' Class : 0
##
# Accuracy
## Comparison table of held-out accuracy across the three models.
accuracy_df <- as.data.frame(t(cbind(confusionMatrix(prediction_logistic,test$CHURN)$overall[1],confusionMatrix(prediction_rf, test$CHURN)$overall[1],
                                     confusionMatrix(prediction_dt, test$CHURN)$overall[1])))
## FIX(review): the third model is the rpart decision tree, not a "Random
## Tree" — label corrected to match the cbind order (logistic, RF, tree).
accuracy_df <- as.data.frame(cbind(c("Logistic","Random Forest","Decision Tree"),
                                   accuracy_df))
colnames(accuracy_df) <- c("Models", "Accuracy")
accuracy_df %>% formattable()
| Models | Accuracy |
|---|---|
| Logistic | 0.7857816 |
| Random Forest | 0.8562842 |
| Random Tree | 0.8608335 |
# Probability
# Class-probability scores per model; column 1 of each prob matrix is the
# probability of the FIRST factor level ("0" = no churn).
# NOTE(review): the gain/lift below targets churn = 1, so ranking by
# P(class 0) orders non-churners first — confirm this direction is intended
# (otherwise use [,2]).
prob_log = predict(logistic, test, "prob")[,1]
prob_tree = predict(tree, test, "prob")[,1]
prob_rf = predict(tree_rf, test, "prob")[,1]
data_classification = as.data.frame(cbind(prob_tree, prob_rf, prob_log))
data_classification = cbind(data_classification, test$CHURN)
colnames(data_classification) <- c("p_tree", "p_rf", "p_log", "churn")
# Lift
lift_log = gain_lift(data = data_classification, score = 'p_log', target = 'churn')
## Population Gain Lift Score.Point
## 1 10 14.98 1.50 0.822205506638208
## 2 20 32.60 1.63 0.755643603064053
## 3 30 53.10 1.77 0.691386010335168
## 4 40 70.01 1.75 0.601596508334041
## 5 50 81.70 1.63 0.492578974035976
## 6 60 93.00 1.55 0.371189499886681
## 7 70 98.29 1.40 0.219784306884891
## 8 80 100.00 1.25 0.127898969290005
## 9 90 100.00 1.11 0.000000005123033
## 10 100 100.00 1.00 0.000000001646095
lift_tree = gain_lift(data = data_classification, score = 'p_tree', target = 'churn')
## Population Gain Lift Score.Point
## 1 10 49.02 4.90 1.0000000
## 2 20 49.02 2.45 1.0000000
## 3 30 78.98 2.63 0.7325143
## 4 40 78.98 1.97 0.7325143
## 5 50 95.49 1.91 0.3734855
## 6 60 100.00 1.67 0.1166481
## 7 70 100.00 1.43 0.1166481
## 8 80 100.00 1.25 0.0000000
## 9 90 100.00 1.11 0.0000000
## 10 100 100.00 1.00 0.0000000
lift_rf = gain_lift(data = data_classification, score = 'p_rf', target = 'churn')
## Population Gain Lift Score.Point
## 1 10 42.85 4.28 1.000
## 2 20 44.71 2.24 0.995
## 3 30 63.09 2.10 0.840
## 4 40 78.12 1.95 0.600
## 5 50 89.42 1.79 0.315
## 6 60 96.52 1.61 0.080
## 7 70 100.00 1.43 0.000
## 8 80 100.00 1.25 0.000
## 9 90 100.00 1.11 0.000
## 10 100 100.00 1.00 0.000
# Auc Roc
# ROC curve and AUC for the random forest.
# NOTE(review): this feeds the HARD class predictions (as.numeric of a
# factor) into prediction(), not the class probabilities computed above —
# a ROC from binary labels has a single operating point; consider passing
# the prob_* scores instead.
pred_rf <- prediction(as.numeric(prediction_rf), as.numeric(test$CHURN))
performance_rf <- performance(pred_rf,"tpr","fpr")
plot(performance_rf,colorize=TRUE)
auc.performance_rf <- performance(pred_rf, measure = "auc")
auc.performance_rf@y.values
## [[1]]
## [1] 0.8535841
# ROC curve and AUC for the decision tree (also from hard class labels —
# same caveat as the random forest curve above).
pred_rt <- prediction(as.numeric(prediction_dt), as.numeric(test$CHURN))
performance_rt <- performance(pred_rt,"tpr","fpr")
plot(performance_rt,colorize=TRUE)
auc.performance_rt <- performance(pred_rt, measure = "auc")
auc.performance_rt@y.values
## [[1]]
## [1] 0.8592514
# ROC curve and AUC for the logistic regression (also from hard class
# labels — same caveat as the random forest curve above).
pred_lg <- prediction(as.numeric(prediction_logistic), as.numeric(test$CHURN))
performance_lg <- performance(pred_lg,"tpr","fpr")
plot(performance_lg,colorize=TRUE)
auc.performance_lg <- performance(pred_lg , measure = "auc")
auc.performance_lg@y.values
## [[1]]
## [1] 0.7880367
Market Basket Analysis is used in marketing to understand the relations between different products. Using various algorithms, a retailer can uncover associations between products. The algorithms use the transactions and look at the frequency with which different combinations of items are bought together.
Association Rules are widely used to analyze retail basket or transaction data, and are intended to identify strong rules discovered in transaction data using measures of interestingness, based on the concept of strong rules.3
# Build transaction data for market-basket analysis: one basket per
# client-timestamp pair, items identified by article id; refund rows
# (non-positive amounts) are excluded.
data_market_basket_analysis <- tickets_clean_final %>% filter(IMPORTO_LORDO2 > 0)
data_market_basket_analysis$ID_CLI_TIC_DATETIME <- paste0(data_market_basket_analysis$ID_CLI, "-", data_market_basket_analysis$TIC_DATETIME)
data_market_basket_analysis <- data_market_basket_analysis %>% select(ID_CLI_TIC_DATETIME, ID_ARTICOLO)
data_market_basket_analysis$ID_ARTICOLO <- as.factor(data_market_basket_analysis$ID_ARTICOLO)
data_market_basket_analysis$ID_CLI_TIC_DATETIME <- as.factor(data_market_basket_analysis$ID_CLI_TIC_DATETIME)
# Round-trip through an anonymous file() connection because
# read.transactions() consumes "single"-format (basket-id, item) rows from
# a file-like source.
write.table(data_market_basket_analysis, file = tmp <- file(), row.names = FALSE)
itemTransactions <- read.transactions(tmp, format = "single", header = TRUE, cols = c("ID_CLI_TIC_DATETIME", "ID_ARTICOLO"))
close(tmp)
# Mine association rules: minimum support 0.1%, minimum confidence 80%.
item_rules <- apriori(itemTransactions, parameter = list(supp = 0.001, conf = 0.8))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.001 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 998
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[96347 item(s), 998032 transaction(s)] done [2.66s].
## sorting and recoding items ... [282 item(s)] done [0.04s].
## creating transaction tree ... done [0.24s].
## checking subsets of size 1 2 3 4 done [0.01s].
## writing ... [26 rule(s)] done [0.00s].
## creating S4 object ... done [0.25s].
## Export the mined rules to CSV and reload them as a data frame so they
## can be ranked by count.
write(item_rules, file = "data.csv", sep = ",")
## FIX(review): read back from the same relative path the rules were just
## written to — the original hard-coded an absolute, machine-specific path
## ("/Volumes/HDD_Ale/...") that breaks on any other machine.
df <- read.csv("data.csv")
df[order(df$count, decreasing = TRUE), ] %>% formattable() %>% head()
| rules | support | confidence | coverage | lift | count | |
|---|---|---|---|---|---|---|
| 11 | {32078935,32079082} => {32079103} | 0.002024985 | 0.8313451 | 0.002435794 | 169.6400 | 2021 |
| 18 | {32078795,32079082} => {32079103} | 0.001930800 | 0.8660674 | 0.002229387 | 176.7252 | 1927 |
| 19 | {32078795,32079103} => {32079082} | 0.001930800 | 0.8352839 | 0.002311549 | 178.5097 | 1927 |
| 4 | {36298381} => {36298353} | 0.001851644 | 0.8461538 | 0.002188307 | 336.5837 | 1848 |
| 17 | {32079082,32842551} => {32079103} | 0.001756457 | 0.8975934 | 0.001956851 | 183.1582 | 1753 |
| 10 | {32078795,32078935} => {32079103} | 0.001516985 | 0.8053191 | 0.001883707 | 164.3292 | 1514 |
writeLines(capture.output(sessionInfo()), "sessionInfo.txt")
Olson D. L. (2009) Recency Frequency and Monetary Model, University of Nebraska at Lincoln↩
A work by Alessandro Pontini
a.pontini1@campus.unimib.it